1 Setup

suppressPackageStartupMessages({
    library(data.table)
    library(DESeq2)
    library(gplots)
    library(here)
    library(hyperSpec)
    library(parallel)
    library(RColorBrewer)
    library(tidyverse)
    library(UpSetR)
    library(VennDiagram)
})
suppressMessages({
    source(here("UPSCb-common/Rtoolbox/src/plotEnrichedTreemap.R"))
    source(here("UPSCb-common/src/R/featureSelection.R"))
    source(here("UPSCb-common/src/R/volcanoPlot.R"))
    source(here("UPSCb-common/src/R/gopher.R"))
})
pal=brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
threads=1
  1. plot specific gene expression
"line_plot" <- function(dds=dds,vst=vst,gene_id=gene_id){
  message(paste("Plotting",gene_id))
  sel <- grepl(gene_id,rownames(vst))
  stopifnot(sum(sel)==1)
  
  p <- ggplot(bind_cols(as.data.frame(colData(dds)),
                        data.frame(value=vst[sel,])),
              aes(x=Time,y=value,col=Time,group=Time)) +
    geom_point() + geom_smooth() +
    scale_y_continuous(name="VST expression") + 
    ggtitle(label=paste("Expression for: ",gene_id))
  
  suppressMessages(suppressWarnings(plot(p)))
  return(NULL)
}
  1. extract the DE results. Default cutoffs are from Schurch et al., RNA, 2016
"extract_results" <- function(dds,vst,contrast,
                              padj=0.01,lfc=0.5,
                              plot=TRUE,verbose=TRUE,
                              export=TRUE,default_dir=here("data/analysis/DE"),
                              default_prefix="DE-",
                              labels=colnames(dds),
                              sample_sel=1:ncol(dds),
                              expression_cutoff=0,
                              debug=FALSE,filter=c("median",NULL),...){
    
    # get the filter
    if(!is.null(match.arg(filter))){
        filter <- rowMedians(counts(dds,normalized=TRUE))
        message("Using the median normalized counts as default, set filter=NULL to revert to using the mean")
    }
    
    # validation
    if(length(contrast)==1){
        res <- results(dds,name=contrast,filter = filter)
    } else {
        res <- results(dds,contrast=contrast,filter = filter)
    }
    
    stopifnot(length(sample_sel)==ncol(vst))
    
    if(plot){
        par(mar=c(5,5,5,5))
        volcanoPlot(res)
        par(mar=mar)
    }
    
    # a look at independent filtering
    if(plot){
        plot(metadata(res)$filterNumRej,
             type="b", ylab="number of rejections",
             xlab="quantiles of filter")
        lines(metadata(res)$lo.fit, col="red")
        abline(v=metadata(res)$filterTheta)
    }
    
    if(verbose){
        message(sprintf("The independent filtering cutoff is %s, removing %s of the data",
                        round(metadata(res)$filterThreshold,digits=5),
                        names(metadata(res)$filterThreshold)))
        
        max.theta <- metadata(res)$filterNumRej[which.max(metadata(res)$filterNumRej$numRej),"theta"]
        message(sprintf("The independent filtering maximises for %s %% of the data, corresponding to a base mean expression of %s (library-size normalised read)",
                        round(max.theta*100,digits=5),
                        round(quantile(counts(dds,normalized=TRUE),probs=max.theta),digits=5)))
    }
    
    if(plot){
        qtl.exp=quantile(counts(dds,normalized=TRUE),probs=metadata(res)$filterNumRej$theta)
        dat <- data.frame(thetas=metadata(res)$filterNumRej$theta,
                          qtl.exp=qtl.exp,
                          number.degs=sapply(lapply(qtl.exp,function(qe){
                              res$padj <= padj & abs(res$log2FoldChange) >= lfc & 
                                  ! is.na(res$padj) & res$baseMean >= qe
                          }),sum))
        if(debug){
            plot(ggplot(dat,aes(x=thetas,y=qtl.exp)) + 
                     geom_line() + geom_point() +
                     scale_x_continuous("quantiles of expression") + 
                     scale_y_continuous("base mean expression") +
                     geom_hline(yintercept=expression_cutoff,
                                linetype="dotted",col="red"))
        
            p <- ggplot(dat,aes(x=thetas,y=qtl.exp)) + 
                geom_line() + geom_point() +
                scale_x_continuous("quantiles of expression") + 
                scale_y_log10("base mean expression") + 
                geom_hline(yintercept=expression_cutoff,
                           linetype="dotted",col="red")
            suppressMessages(suppressWarnings(plot(p)))
            
            plot(ggplot(dat,aes(x=thetas,y=number.degs)) + 
                     geom_line() + geom_point() +
                     geom_hline(yintercept=dat$number.degs[1],linetype="dashed") +
                     scale_x_continuous("quantiles of expression") + 
                     scale_y_continuous("Number of DE genes"))
            
            plot(ggplot(dat,aes(x=thetas,y=number.degs[1] - number.degs),aes()) + 
                     geom_line() + geom_point() +
                     scale_x_continuous("quantiles of expression") + 
                     scale_y_continuous("Cumulative number of DE genes"))
            
            plot(ggplot(data.frame(x=dat$thetas[-1],
                                   y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) + 
                     geom_line() + geom_point() +
                     scale_x_continuous("quantiles of expression") + 
                     scale_y_continuous("Number of DE genes per interval"))
            
            plot(ggplot(data.frame(x=dat$qtl.exp[-1],
                                   y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) + 
                     geom_line() + geom_point() +
                     scale_x_continuous("base mean of expression") + 
                     scale_y_continuous("Number of DE genes per interval"))
            
            p <- ggplot(data.frame(x=dat$qtl.exp[-1],
                                   y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) + 
                geom_line() + geom_point() +
                scale_x_log10("base mean of expression") + 
                scale_y_continuous("Number of DE genes per interval") + 
                geom_vline(xintercept=expression_cutoff,
                           linetype="dotted",col="red")
            suppressMessages(suppressWarnings(plot(p)))
        }
    }
    
    sel <- res$padj <= padj & abs(res$log2FoldChange) >= lfc & ! is.na(res$padj) & 
        res$baseMean >= expression_cutoff
    
    if(verbose){
      message(sprintf(paste(
        ifelse(sum(sel)==1,
               "There is %s gene that is DE",
               "There are %s genes that are DE"),
        "with the following parameters: FDR <= %s, |log2FC| >= %s, base mean expression > %s"),
        sum(sel),padj,
        lfc,expression_cutoff))
    }
    
    # proceed only if there are DE genes
    if(sum(sel) > 0){
        val <- rowSums(vst[sel,sample_sel,drop=FALSE])==0
        if (sum(val) >0){
          warning(sprintf(paste(
            ifelse(sum(val)==1,
                   "There is %s DE gene that has",
                   "There are %s DE genes that have"),
            "no vst expression in the selected samples"),sum(val)))
          sel[sel][val] <- FALSE
        } 

        if(export){
            if(!dir.exists(default_dir)){
                dir.create(default_dir,showWarnings=FALSE,recursive=TRUE,mode="0771")
            }
            write.csv(res,file=file.path(default_dir,paste0(default_prefix,"results.csv")))
            write.csv(res[sel,],file.path(default_dir,paste0(default_prefix,"genes.csv")))
        }
        if(plot & sum(sel)>1){
            heatmap.2(t(scale(t(vst[sel,sample_sel]))),
                      distfun = pearson.dist,
                      hclustfun = function(X){hclust(X,method="ward.D2")},
                      trace="none",col=hpal,labRow = FALSE,
                      labCol=labels[sample_sel],...
            )
        }
    }
    return(list(all=rownames(res[sel,]),
                up=rownames(res[sel & res$log2FoldChange > 0,]),
                dn=rownames(res[sel & res$log2FoldChange < 0,])))
}
  1. extract and plot the enrichment results
extractEnrichmentResults <- function(enrichment,task="go",
                                     diff.exp=c("all","up","dn"),
                                     go.namespace=c("BP","CC","MF"),
                                     genes=NULL,export=TRUE,plot=TRUE,
                                     default_dir=here("data/analysis/DE"),
                                     default_prefix="DE",
                                     url="athaliana"){
    # process args
    diff.exp <- match.arg(diff.exp)
    de <- ifelse(diff.exp=="all","none",
                 ifelse(diff.exp=="dn","down",diff.exp))

    # sanity
    if( is.null(enrichment[[task]]) | length(enrichment[[task]]) == 0){
        message(paste("No enrichment for",task))
    } else {

        # write out
        if(export){
            write_tsv(enrichment[[task]],
                      file=here(default_dir,
                                paste0(default_prefix,"-genes_GO-enrichment.tsv")))
            if(!is.null(genes)){
                write_tsv(
                    enrichedTermToGenes(genes=genes,terms=enrichment[[task]]$id,url=url,mc.cores=16L),
                    file=here(default_dir,
                              paste0(default_prefix,"-enriched-term-to-genes.tsv"))
                )
            }
        }
        
        if(plot){
            sapply(go.namespace,function(ns){
                titles <- c(BP="Biological Process",
                            CC="Cellular Component",
                            MF="Molecular Function")
                suppressWarnings(tryCatch({plotEnrichedTreemap(enrichment,enrichment=task,
                                                               namespace=ns,
                                                               de=de,title=paste(default_prefix,titles[ns]))},
                                          error = function(e) {
                                              message(paste("Treemap plot failed for",ns, 
                                                            "because of:",e))
                                              return(NULL)
                                          }))
            })
        }
    }
}
  1. get the background genes from the dds
getBgGenes <- function(dds,threads=1L){
  ifilt <- mclapply(resultsNames(dds),
                    function(nam,dds){
                      r<-results(dds,name=nam)
                      return(rownames(r)[!is.na(r$padj)])
                    },dds,mc.cores=threads)
  names(ifilt) <- resultsNames(dds)
  nobg <- Reduce(union,ifilt)
  upset(fromList(lapply(ifilt,function(i){which(nobg %in% i)})))
  return(nobg)
}
load(here("data/analysis/salmon/dds-sample-swap-corrected.rda"))

1.1 Update

  • Creating the response variable
dds$Response <- factor(levels=c("control","acute","early","late"),
                       sub(".*hrs","late",
                           gsub("12hrs|^4hrs","early",
                                sub("60min","acute",
                                    sub("std","control",dds$Time)))))

1.2 Normalisation for visualisation

vsd <- varianceStabilizingTransformation(dds,blind=FALSE)
## using 'avgTxLength' from assays(dds), correcting for library size
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
vst <- assay(vsd)
vst <- vst - min(vst)
dir.create(here("data/analysis/DE"),showWarnings=FALSE)
save(vst,file=here("data/analysis/DE/vst-aware.rda"))
write_delim(as.data.frame(vst) %>% rownames_to_column("ID"),
            here("data/analysis/DE/vst-aware.tsv"))

1.3 Gene of interests

#goi <- read_lines(here("doc/goi.txt"))
#stopifnot(all(goi %in% rownames(vst)))
#dev.null <- lapply(goi,line_plot,dds=dds,vst=vst)

1.4 Differential Expression

  • Updating the model
design(dds) <- ~Response
dds <- DESeq(dds)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 1358 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
  • Dispersion estimation The dispersion estimation is adequate
plotDispEsts(dds)

Check the different contrasts

resultsNames(dds)
## [1] "Intercept"                 "Response_acute_vs_control"
## [3] "Response_early_vs_control" "Response_late_vs_control"

1.5 Results

dir.create(here("data/analysis/DE/Response"))
## Warning in dir.create(here("data/analysis/DE/Response")):
## '/mnt/picea/home/delhomme/Git/algalAcclimatization/data/analysis/DE/Response'
## already exists

1.5.1 Acute vs. control

acute_vs_control <- extract_results(dds,vst,"Response_acute_vs_control",
                                    default_dir=here("data/analysis/DE/Response"),
                                    default_prefix="acute_vs_control_",
                                    sample_sel=dds$Response %in% c("acute","control"),
                                    labels=dds$Response)
## Using the median normalized counts as default, set filter=NULL to revert to using the mean
## Loading required package: LSD

## The independent filtering cutoff is 0.2292, removing 28.34246% of the data
## The independent filtering maximises for 28.34246 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 8336 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
## Warning in extract_results(dds, vst, "Response_acute_vs_control", default_dir =
## here("data/analysis/DE/Response"), : There are 6 DE genes that have no vst
## expression in the selected samples

1.5.2 Early vs. control

early_vs_control <- extract_results(dds,vst,"Response_early_vs_control",
                                    default_dir=here("data/analysis/DE/Response"),
                                    default_prefix="early_vs_control_",
                                    sample_sel=dds$Response %in% c("early","control"),
                                    labels=dds$Response)
## Using the median normalized counts as default, set filter=NULL to revert to using the mean

## The independent filtering cutoff is 0.2292, removing 28.34246% of the data
## The independent filtering maximises for 28.34246 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 13498 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0

1.5.3 Late vs. control

late_vs_control <- extract_results(dds,vst,"Response_late_vs_control",
                                    default_dir=here("data/analysis/DE/Response"),
                                    default_prefix="late_vs_control_",
                                    sample_sel=dds$Response %in% c("late","control"),
                                    labels=dds$Response)
## Using the median normalized counts as default, set filter=NULL to revert to using the mean

## The independent filtering cutoff is 0.2292, removing 28.34246% of the data
## The independent filtering maximises for 28.34246 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 8716 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0

1.5.4 Venn Diagram

res.list <- list(acute=acute_vs_control,
                 early=early_vs_control,
                 late=late_vs_control)

1.5.4.1 All DE genes

grid.newpage()
grid.draw(venn.diagram(lapply(res.list,"[[","all"),
                      NULL,
                      fill=pal[1:3]))

1.5.4.2 DE genes (up in mutant)

grid.newpage()
grid.draw(venn.diagram(lapply(res.list,"[[","up"),
                      NULL,
                      fill=pal[1:3]))

1.5.4.3 DE genes (up in control)

grid.newpage()
grid.draw(venn.diagram(lapply(res.list,"[[","dn"),
                      NULL,
                      fill=pal[1:3]))

1.5.5 Gene Ontology enrichment

background <- getBgGenes(dds,threads=length(resultsNames(dds)))

enr.list <- lapply(res.list,function(r){
    lapply(r,gopher,background=background,task="go",url="algae")
})

dev.null <- lapply(names(enr.list),function(n){
    lapply(names(enr.list[[n]]),function(de){
        extractEnrichmentResults(enr.list[[n]][[de]],
                                 diff.exp=de,
                                 genes=res.list[[n]][[de]],
                                 default_prefix=paste(n,de,sep="-"),
                                 url="algae")
    })
})
## Loading required package: treemap

1.5.6 Visualisation

1.5.6.1 Export

dev.null <- lapply(list.files(here("data/analysis/DE"),pattern="*-genes_GO-enrichment.tsv",full.names=TRUE),
                   function(fil){
                     write_tsv(read_tsv(fil,col_select=c("id","padj"),
                                        show_col_types=FALSE),
                               here(sub("\\.tsv","_for-REVIGO.tsv",fil)),
                               col_names=FALSE)
                   })

2 Session Info

## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
##  [1] parallel  grid      stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] treemap_2.4-4               GO.db_3.18.0               
##  [3] AnnotationDbi_1.64.1        jsonlite_1.8.7             
##  [5] LSD_4.1-0                   limma_3.58.1               
##  [7] VennDiagram_1.7.3           futile.logger_1.4.3        
##  [9] UpSetR_1.4.0                lubridate_1.9.3            
## [11] forcats_1.0.0               stringr_1.5.0              
## [13] dplyr_1.1.3                 purrr_1.0.2                
## [15] readr_2.1.4                 tidyr_1.3.0                
## [17] tibble_3.2.1                tidyverse_2.0.0            
## [19] RColorBrewer_1.1-3          hyperSpec_0.100.0          
## [21] xml2_1.3.5                  ggplot2_3.4.4              
## [23] lattice_0.21-8              here_1.0.1                 
## [25] gplots_3.1.3                DESeq2_1.42.0              
## [27] SummarizedExperiment_1.32.0 Biobase_2.62.0             
## [29] MatrixGenerics_1.14.0       matrixStats_1.1.0          
## [31] GenomicRanges_1.54.1        GenomeInfoDb_1.38.0        
## [33] IRanges_2.36.0              S4Vectors_0.40.1           
## [35] BiocGenerics_0.48.1         data.table_1.14.8          
## 
## loaded via a namespace (and not attached):
##  [1] rstudioapi_0.15.0       magrittr_2.0.3          farver_2.1.1           
##  [4] rmarkdown_2.25          zlibbioc_1.48.0         vctrs_0.6.4            
##  [7] memoise_2.0.1           RCurl_1.98-1.13         htmltools_0.5.7        
## [10] S4Arrays_1.2.0          lambda.r_1.2.4          curl_5.1.0             
## [13] SparseArray_1.2.1       sass_0.4.7              KernSmooth_2.23-22     
## [16] bslib_0.5.1             plyr_1.8.9              testthat_3.2.0         
## [19] futile.options_1.0.1    cachem_1.0.8            igraph_1.5.1           
## [22] mime_0.12               lifecycle_1.0.4         pkgconfig_2.0.3        
## [25] Matrix_1.6-0            R6_2.5.1                fastmap_1.1.1          
## [28] GenomeInfoDbData_1.2.11 shiny_1.7.5.1           digest_0.6.33          
## [31] colorspace_2.1-0        rprojroot_2.0.4         RSQLite_2.3.2          
## [34] labeling_0.4.3          fansi_1.0.5             timechange_0.2.0       
## [37] httr_1.4.7              abind_1.4-5             compiler_4.3.1         
## [40] bit64_4.0.5             withr_2.5.2             BiocParallel_1.36.0    
## [43] DBI_1.1.3               highr_0.10              DelayedArray_0.28.0    
## [46] gtools_3.9.4            caTools_1.18.2          tools_4.3.1            
## [49] httpuv_1.6.12           glue_1.6.2              promises_1.2.1         
## [52] gridBase_0.4-7          generics_0.1.3          gtable_0.3.4           
## [55] tzdb_0.4.0              hms_1.1.3               utf8_1.2.4             
## [58] XVector_0.42.0          pillar_1.9.0            vroom_1.6.4            
## [61] later_1.3.1             bit_4.0.5               deldir_1.0-9           
## [64] tidyselect_1.2.0        locfit_1.5-9.8          Biostrings_2.70.1      
## [67] knitr_1.45              gridExtra_2.3           xfun_0.41              
## [70] statmod_1.5.0           brio_1.1.3              stringi_1.7.12         
## [73] lazyeval_0.2.2          yaml_2.3.7              evaluate_0.23          
## [76] codetools_0.2-19        interp_1.1-4            cli_3.6.1              
## [79] xtable_1.8-4            munsell_0.5.0           jquerylib_0.1.4        
## [82] Rcpp_1.0.11             png_0.1-8               ellipsis_0.3.2         
## [85] blob_1.2.4              latticeExtra_0.6-30     jpeg_0.1-10            
## [88] bitops_1.0-7            scales_1.2.1            crayon_1.5.2           
## [91] rlang_1.1.2             KEGGREST_1.42.0         formatR_1.14